Variables:
Risk
Money
Security
Good time Help Success Proper Environment Tradition Creativity
Friends important Family important Leisure time Happiness Health (subjective) Satisfaction Freedom
Sex Age Country Wave Marital status Children Employment Education
library(data.table)
library(tidyr)
# Wave 5: load the serialized WVS object from disk.
WV5_data <- readRDS(
  file = "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds"
)
# Coerce the loaded object to a plain data.frame for the steps below.
WV5_data_df <- as.data.frame(x = WV5_data)
# Preview: first rows of the first five columns.
head(WV5_data_df[, seq_len(5)])
library(dplyr)
# Give the Wave 5 survey items readable names (new_name = original column).
WV5_data <- rename(
  WV5_data_df,
  sex = V235,
  age = V237,
  country = V2,
  wave = V1,
  family_important = V4,
  friends_important = V5,
  leisure_time = V6,
  happiness = V10,
  health = V11,
  satisfaction = V22,
  freedom = V46,
  marital_status = V55,
  children = V56,
  creativity = V80,
  money = V81,
  security = V82,
  goodtime = V83,
  help = V84,
  success = V85,
  risk = V86,
  proper = V87,
  environment = V88,
  tradition = V89,
  employment = V241,
  education = V238
)
WV5_data
# Keep only the renamed analysis variables, in this column order.
WV5_data <- select(
  WV5_data,
  sex, age, country, wave,
  family_important, friends_important, leisure_time,
  happiness, health, satisfaction, freedom,
  marital_status, children,
  creativity, money, security, goodtime, help, success, risk,
  proper, environment, tradition,
  employment, education
)
WV5_data
# Translate the numeric country codes of Wave 5 into country names using the
# code/name lookup file (two unnamed columns: code, name).
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# match() gives, for every respondent, the lookup row holding their country.
WV5_data$country_lab <- countrynames$name[
  match(WV5_data$country, countrynames$code)
]
# Respondents per country.
table(WV5_data$country_lab)
Andorra Argentina Australia
1003 1002 1421
Brazil Bulgaria Burkina Faso
1500 1001 1534
Canada Chile China
2164 1000 1991
Colombia Cyprus (G) Egypt
3025 1050 3051
Ethiopia Finland France
1500 1014 1001
Georgia Germany Ghana
1500 2064 1534
Great Britain Guatemala Hong Kong
1041 1000 1252
Hungary India Indonesia
1007 2001 2015
Iran Iraq Italy
2667 2701 1012
Japan Jordan Malaysia
1096 1200 1201
Mali Mexico Moldova
1534 1560 1046
Morocco Netherlands New Zealand
1200 1050 954
Norway Peru Poland
1025 1500 1000
Romania Russia Rwanda
1776 2033 1507
Slovenia South Africa South Korea
1037 2988 1200
Spain Sweden Switzerland
1200 1003 1241
Taiwan Thailand Trinidad and Tobago
1227 1534 1002
Turkey Ukraine United States
1346 1000 1249
Uruguay Viet Nam Zambia
1000 1495 1500
WV5_data
NA
NA
# Wave 6: the .rdata file is expected to restore an object called
# WV6_Data_R_v20201117 into the workspace; it is copied to the shorter
# working name used by the rest of the script.
load(file = "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
```{r}
#rename variables
# Give the Wave 6 survey items the same readable names as Wave 5
# (new_name = original column; the source columns differ between waves).
WV6_data <- rename(
  WV6_data,
  wave = V1,
  sex = V240,
  age = V242,
  country = V2,
  marital_status = V57,
  children = V58,
  employment = V229,
  education = V248,
  risk = V76,
  money = V71,
  security = V72,
  goodtime = V73,
  help = V74B,
  success = V75,
  proper = V77,
  environment = V78,
  tradition = V79,
  creativity = V70,
  family_important = V4,
  friends_important = V5,
  leisure_time = V6,
  happiness = V10,
  health = V11,
  satisfaction = V23,
  freedom = V55
)
# Keep only the renamed analysis variables, in this column order.
WV6_data <- select(
  WV6_data,
  sex, age, country, wave,
  marital_status, children, employment, education,
  risk, money, security, goodtime, help, success,
  proper, environment, tradition, creativity,
  family_important, friends_important, leisure_time,
  happiness, health, satisfaction, freedom
)
WV6_data
NA
# Decode dataset (Wave 6): translate numeric country codes into country names
# using the code/name lookup file (two unnamed columns: code, name).
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")
# match() gives, for every respondent, the lookup row holding their country.
WV6_data$country_lab <- countrynames$name[
  match(WV6_data$country, countrynames$code)
]
# Respondents per country.
table(WV6_data$country_lab)
Algeria Argentina Armenia
1200 1030 1100
Australia Azerbaijan Belarus
1477 1002 1535
Brazil Chile China
1486 1000 2300
Colombia Cyprus (G) Ecuador
1512 1000 1202
Egypt Estonia Georgia
1523 1533 1202
Germany Ghana Haiti
2046 1552 1996
Hong Kong India Iraq
1000 4078 1200
Japan Jordan Kazakhstan
2443 1200 1500
Kuwait Kyrgyzstan Lebanon
1303 1500 1200
Libya Malaysia Mexico
2131 1300 2000
Morocco Netherlands New Zealand
1200 1902 841
Nigeria Pakistan Palestine
1759 1200 1000
Peru Philippines Poland
1210 1200 966
Qatar Romania Russia
1060 1503 2500
Rwanda Singapore Slovenia
1527 1972 1069
South Africa South Korea Spain
3531 1200 1189
Sweden Taiwan Thailand
1206 1238 1200
Trinidad and Tobago Tunisia Turkey
999 1205 1605
Ukraine United States Uruguay
1500 2232 1000
Uzbekistan Yemen Zimbabwe
1500 1000 1500
WV6_data
# Stack the two waves into one data frame (Wave 5 rows first, then Wave 6).
WV5_data
WV6_data
data <- rbind(WV5_data, WV6_data)
data
# How many distinct countries are in the combined data?
length(unique(data[["country_lab"]]))
[1] 80
#number of participants
nrow(data)
[1] 173540
# Exclusion of participants: keep only respondents with a strictly positive
# code on every analysis variable (children may also be 0). Non-positive codes
# are presumably the survey's missing/invalid-answer codes -- TODO confirm
# against the codebook.
#
# All conditions must be joined with `&` in ONE expression. A comma inside the
# condition would pass everything after it to subset()'s `select` argument
# (column selection), so those conditions would silently filter no rows.
data <- subset(
  data,
  risk > 0 & sex > 0 & age > 0 &
    education > 0 & employment > 0 &
    marital_status > 0 & children >= 0 &
    family_important > 0 & friends_important > 0 & leisure_time > 0 &
    happiness > 0 & health > 0 & satisfaction > 0 & freedom > 0 &
    creativity > 0 & money > 0 & security > 0 & goodtime > 0 &
    help > 0 & success > 0 &
    proper > 0 & environment > 0 & tradition > 0
)
data
# number of males vs females (1 = males; 2 = females)
table(data$sex)
# Check the unique responses of each variable with frequencies
for (col_name in names(data)) {
  response_table <- table(data[[col_name]])
  print(paste("Response frequencies for", col_name, ":"))
  print(response_table)
}
# Recode sex from its numeric codes to labels (1 = male, 2 = female).
# The first assignment turns the column into character; the comparison with 2
# still works because R coerces the number to "2" before comparing.
# NOTE(review): `agecat` is used by a later boxplot but is not created anywhere
# in the visible code -- the chunk that builds it appears to be missing.
data$sex[data$sex == 1] <- "male"
data$sex[data$sex == 2] <- "female"
#gender variables
mean(data$age)
[1] 41.59569
#average age of participants
range(data$age)
[1] 15 99
#age range
library(ggplot2)
# Distribution of the risk-taking item.
ggplot(data = data, mapping = aes(x = risk)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  theme_minimal() +
  labs(title = "Histogram of Risk Taking", x = "Risk Taking", y = "Frequency")
#risk taking Frequency
# Distribution of respondent age.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  # Title typo fixed ("Distributionn" -> "Distribution").
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distribution") +
  theme_minimal()
#age frequency
# Risk-taking by age category, one box per level of `agecat`.
ggplot(data = data, mapping = aes(x = agecat, y = risk)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    x = "Age",
    y = "Risk and Adventure",
    title = "Boxplot of Risk and Adventure by Age"
  )
NA
NA
#age vs risk taking
# Risk-taking by sex, one box per sex level.
ggplot(data = data, mapping = aes(x = as.factor(sex), y = risk)) +
  geom_boxplot()
#sex vs risk taking
summary(data)
sex age country wave family_important friends_important leisure_time happiness health satisfaction
Length:149626 Min. :15.0 Min. : 12.0 Min. :5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000
Class :character 1st Qu.:28.0 1st Qu.:276.0 1st Qu.:5.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 5.000
Mode :character Median :39.0 Median :484.0 Median :6.000 Median : 1.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 2.000 Median : 7.000
Mean :41.6 Mean :481.5 Mean :5.552 Mean : 1.094 Mean : 1.661 Mean : 1.871 Mean : 1.865 Mean : 2.106 Mean : 6.755
3rd Qu.:53.0 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 8.000
Max. :99.0 Max. :894.0 Max. :6.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 4.000 Max. : 5.000 Max. :10.000
NA's :221 NA's :351 NA's :698 NA's :573 NA's :230 NA's :340
freedom marital_status children creativity money security goodtime help success risk
Min. :-5.000 Min. :1.000 Min. :0.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.00 Min. :-5.000 Min. :1.000
1st Qu.: 6.000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.: 2.000 1st Qu.:3.000
Median : 7.000 Median :1.000 Median :2.000 Median : 3.000 Median : 4.000 Median : 2.000 Median : 3.000 Median : 2.00 Median : 3.000 Median :4.000
Mean : 7.004 Mean :2.715 Mean :1.843 Mean : 2.718 Mean : 3.846 Mean : 2.374 Mean : 3.273 Mean : 2.29 Mean : 2.951 Mean :3.801
3rd Qu.: 9.000 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 5.000 3rd Qu.: 3.00 3rd Qu.: 4.000 3rd Qu.:5.000
Max. :10.000 Max. :6.000 Max. :8.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.00 Max. : 6.000 Max. :6.000
NA's :838 NA's :972 NA's :602 NA's :442 NA's :566 NA's :44862 NA's :703
proper environment tradition employment education country_lab agecat
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :1.000 Min. :1.000 Length:149626 Length:149626
1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:3.000 Class :character Class :character
Median : 2.000 Median : 2.000 Median : 2.000 Median :3.000 Median :5.000 Mode :character Mode :character
Mean : 2.533 Mean : 2.468 Mean : 2.511 Mean :3.406 Mean :5.501
3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.: 3.000 3rd Qu.:5.000 3rd Qu.:7.000
Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. :8.000 Max. :9.000
NA's :541 NA's :561 NA's :518
# Data cleaning: drop every row that has a missing value in any column,
# then re-inspect the per-column summaries.
data <- na.omit(object = data)
summary(object = data)
sex age country wave family_important friends_important leisure_time
Length:101172 Min. :15.00 Min. : 12.0 Min. :5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000
Class :character 1st Qu.:27.00 1st Qu.:268.0 1st Qu.:5.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
Mode :character Median :39.00 Median :458.0 Median :5.000 Median : 1.000 Median : 2.000 Median : 2.000
Mean :41.11 Mean :474.4 Mean :5.348 Mean : 1.099 Mean : 1.652 Mean : 1.893
3rd Qu.:53.00 3rd Qu.:710.0 3rd Qu.:6.000 3rd Qu.: 1.000 3rd Qu.: 2.000 3rd Qu.: 2.000
Max. :99.00 Max. :894.0 Max. :6.000 Max. : 4.000 Max. : 4.000 Max. : 4.000
happiness health satisfaction freedom marital_status children creativity
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.00 Min. :1.000 Min. :0.000 Min. :-5.000
1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 5.000 1st Qu.: 5.00 1st Qu.:1.000 1st Qu.:0.000 1st Qu.: 2.000
Median : 2.000 Median : 2.000 Median : 7.000 Median : 7.00 Median :1.000 Median :2.000 Median : 2.000
Mean : 1.889 Mean : 2.098 Mean : 6.692 Mean : 6.91 Mean :2.769 Mean :1.835 Mean : 2.699
3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 8.000 3rd Qu.: 9.00 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.: 4.000
Max. : 4.000 Max. : 5.000 Max. :10.000 Max. :10.00 Max. :6.000 Max. :8.000 Max. : 6.000
money security goodtime help success risk proper
Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :-5.000 Min. :1.000 Min. :-5.000
1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.:3.000 1st Qu.: 1.000
Median : 4.000 Median : 2.000 Median : 3.000 Median : 2.000 Median : 3.000 Median :4.000 Median : 2.000
Mean : 3.842 Mean : 2.363 Mean : 3.243 Mean : 2.281 Mean : 2.937 Mean :3.827 Mean : 2.538
3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 5.000 3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.:5.000 3rd Qu.: 3.000
Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. : 6.000 Max. :6.000 Max. : 6.000
environment tradition employment education country_lab agecat education_cat
Min. :-5.000 Min. :-5.00 Min. :1.000 Min. :1.000 Length:101172 Length:101172 Length:101172
1st Qu.: 2.000 1st Qu.: 1.00 1st Qu.:1.000 1st Qu.:3.000 Class :character Class :character Class :character
Median : 2.000 Median : 2.00 Median :3.000 Median :5.000 Mode :character Mode :character Mode :character
Mean : 2.452 Mean : 2.51 Mean :3.467 Mean :5.309
3rd Qu.: 3.000 3rd Qu.: 3.00 3rd Qu.:5.000 3rd Qu.:7.000
Max. : 6.000 Max. : 6.00 Max. :8.000 Max. :9.000
# Risk vs education: scatter plot with a linear trend line, followed by a
# simple linear regression of risk on education.
# NOTE(review): the plot puts risk on the x axis, so its trend line fits
# education ~ risk, while the model below is risk ~ education -- confirm which
# direction is intended.
ggplot(data = data, mapping = aes(x = risk, y = education)) +
  geom_point() +
  geom_smooth(method = "lm")
model <- lm(formula = risk ~ education, data = data)
summary(model)
Call:
lm(formula = risk ~ education, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.0532 -1.0532 0.1564 1.2612 2.3660
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.10560 0.01183 347.08 <2e-16 ***
education -0.05240 0.00202 -25.95 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.589 on 101170 degrees of freedom
Multiple R-squared: 0.00661, Adjusted R-squared: 0.0066
F-statistic: 673.1 on 1 and 101170 DF, p-value: < 2.2e-16
# Risk vs freedom: scatter plot with a linear trend line, followed by a
# simple linear regression of risk on freedom.
ggplot(data = data, mapping = aes(x = risk, y = freedom)) +
  geom_point() +
  geom_smooth(method = "lm")
model1 <- lm(formula = risk ~ freedom, data = data)
summary(model1)
Call:
lm(formula = risk ~ freedom, data = data)
Residuals:
Min 1Q Median 3Q Max
-3.3968 -1.1100 0.1769 1.2247 2.3204
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.157773 0.014987 277.43 <2e-16 ***
freedom -0.047814 0.002045 -23.38 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.59 on 101170 degrees of freedom
Multiple R-squared: 0.005375, Adjusted R-squared: 0.005365
F-statistic: 546.7 on 1 and 101170 DF, p-value: < 2.2e-16
# Risk-taking by survey wave, one box per wave.
ggplot(data = data, mapping = aes(x = as.factor(wave), y = risk)) +
  geom_boxplot()
# Risk vs age: scatter plot with a linear trend line.
ggplot(data = data, mapping = aes(x = risk, y = age)) +
  geom_point() +
  geom_smooth(method = "lm")
# Collapse the education codes into three categories:
#   below 3 -> "incomplete or no primary education"
#   3 to 6  -> "no uni"
#   7 and up -> "uni"
# Columns are referenced explicitly through `data$` instead of attach(): an
# attached copy is masked by any same-named object in the workspace and does
# not follow later changes to `data`.
data$education_cat <- NA_character_
data$education_cat[data$education < 3] <- "incomplete or no primary education"
data$education_cat[data$education >= 3 & data$education <= 6] <- "no uni"
data$education_cat[data$education >= 7] <- "uni"
table(data$education)
1 2 3 4 5 6 7 8 9
9751 9603 18657 11323 29208 10845 24715 10923 24601
data
# Replace the numeric wave codes with readable labels.
# Both assignments target `wave`; writing the "Wave 6" label into `sex` would
# overwrite the sex of every Wave 6 respondent. After the first assignment the
# column is character, and the comparison with 6 still matches because R
# coerces the number to "6".
# The recode is applied once: running the same two lines a second time would
# match no rows.
data$wave[data$wave == 5] <- "Wave 5"
data$wave[data$wave == 6] <- "Wave 6"
data
```